# Display the saved forest figure (generated by the "RandomForest Forest" cell below)
Image('ExampleRandomForest.png')
An example is to train a few classifiers (k-NN, logistic regression, etc.), aggregate the predictions of each classifier, and predict the class that gets the most votes.
Decision Trees work great with the data used to create and train the model. However, they are inflexible when it comes to new data.
Bootstrapping our data and using the aggregate of our tally to make a decision is called bagging (short for bootstrap aggregating).
This algorithm searches for the pair $(k, t_k)$ that produces the purest subsets (weighted by their size). The cost function that the algorithm tries to minimize is as follows:
$$J(k,t_k) = \frac{m_{left}}{m}G_{left} + \frac{m_{right}}{m}G_{right}$$

where $G_{left/right}$ measures the impurity of the left/right subset, and
$m_{left/right}$ is the number of instances in the left/right subset.
where: $$MSE_{node} = \sum_{i \in node} (\hat{y}_{node} - y^{(i)})^2$$ $$\hat{y}_{node} = \frac{1}{m_{node}}\sum_{i \in node} (y^{(i)})$$
To get more variability in our trees, only consider a random subset of explanatory variables at each step.
$$\frac{6}{7+6}0 + \frac{7}{7+6}0.49$$
$$\frac{4}{4+9}0 + \frac{9}{4+9}0.49$$
# Display the root-node stump rendered by the "Root Node" cell below
Image('RFrootNode.png')
Repeat step one n times to build a forest of decision trees.
# Re-display the forest figure to illustrate the repeated-tree-building step
Image('ExampleRandomForest.png')
Estimate the accuracy of the random forest:
Select most accurate random forest based upon Out-of-Bag error.
# Re-display the forest figure alongside the out-of-bag discussion
Image('ExampleRandomForest.png')
Libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression
import matplotlib.lines as mlines
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import pandas as pd
import scipy
from IPython.display import display
from IPython.display import Image
from sklearn.datasets import make_regression
from sklearn import tree
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from mlxtend.plotting import plot_decision_regions
Dataset:
# Toy admissions dataset: three explanatory variables and a binary target.
boost_classifier = pd.DataFrame({
    'Masters':  ['Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No'],
    'Age':      [22, 57, 44, 29, 32, 24, 27, 30, 31, 50, 47, 39, 46],
    'Language': ['Python', 'R', 'Python', 'C+', 'R', 'Python', 'Python', 'Python', 'R', 'C+', 'Python', 'R', 'R'],
    'Accepted': [1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1],
})
# Fix the RNG so the bootstrap sample below is reproducible.
np.random.seed(40)
# Bootstrap: draw 13 rows with replacement (same size as the original data).
bootstrapped = boost_classifier.sample(replace=True, n=13)
# Out-of-bag rows: the observations never drawn into the bootstrap sample.
ofb = boost_classifier[~boost_classifier.index.isin(bootstrapped.index)]
Root Node:
# Root-node illustration: fit a depth-1 decision tree (a "stump") on the
# bootstrapped sample and render it to 'RFrootNode.png' with graphviz.
bootstrapped = pd.DataFrame({'Masters':['No','No','No','No','Yes','Yes','Yes','Yes','Yes','No','Yes','No','No'],
                             'Age':[24,47,27,32,30,30,57,22,39,27,57,44,50],
                             'Language': ['Python','Python','Python','R','Python','Python','R','Python','R','Python','R','Python','C+'],
                             'Accepted': [1,1,0,1,1,1,1,1,1,0,1,0,0]})
# Random-subspace step: consider only a subset of the features (drop Age).
subset_split = bootstrapped[['Masters','Language','Accepted']]
dummy_subset = pd.get_dummies(subset_split)
#Visualize Tree
X = dummy_subset.drop(['Accepted'],axis =1).values
y = dummy_subset['Accepted'].values
# 'Accepted' is column 0 after get_dummies, so columns[1:] are the features.
fn = dummy_subset.columns[1:]
clf = DecisionTreeClassifier(max_depth =1)
clf = clf.fit(X,y)
# BUG FIX: export_graphviz assigns class_names to the classes in ascending
# order, so class 0 (Accepted=0, i.e. not accepted) must be labeled first:
# ['No','Yes'], not ['Yes','No'] — the old order inverted the node labels.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=fn,
                                class_names=['No','Yes'],
                                filled=True, rounded=True,
                                special_characters=True)
r = graphviz.Source(dot_data)
r.format = 'png'
r.filename = 'RFrootNode'
r.render()
RandomForest Forest:
# Recreate the dataset, fit a random forest, and plot its first four trees
# to 'ExampleRandomForest.png'.
boost_classifier = pd.DataFrame({'Masters':['Yes','Yes','No','Yes','No','No','No','Yes','Yes','No','No','Yes','No'],
                                 'Age':[22,57,44,29,32,24,27,30,31,50,47,39,46],
                                 'Language': ['Python','R','Python','C+','R','Python','Python','Python','R','C+','Python','R','R'],
                                 'Accepted': [1,1,0,0,1,1,0,1,0,0,1,1,1]})
dummy_subset = pd.get_dummies(boost_classifier)
#Visualize Tree
X = dummy_subset.drop(['Accepted'],axis =1).values
y = dummy_subset['Accepted'].values
# Feature names: every column except 'Accepted' (index 1 after get_dummies).
fn = dummy_subset.columns[[0,2,3,4,5,6]]
# BUG FIX: plot_tree maps class_names to the classes in ascending order, so
# class 0 (not accepted) must come first: ['No','Yes'], not ['Yes','No'].
cn = ['No','Yes']
clf = RandomForestClassifier()
clf = clf.fit(X,y)
fig, axes = plt.subplots(nrows = 1,ncols = 4,figsize = [12,5], dpi=900)
for index in range(0, 4):
    tree.plot_tree(clf.estimators_[index],
                   filled = True,
                   feature_names = fn,
                   class_names=cn,
                   ax = axes[index]);
    axes[index].set_title('Estimator: ' + str(index), fontsize = 11)
fig.savefig('ExampleRandomForest.png')
Voting Classifier:
# Splitting our dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)
# We want to have three different classification algorithms
logreg_clf = LogisticRegression()
random_clf = RandomForestClassifier()
# probability=True is required so SVC exposes predict_proba for soft voting
svm_clf = SVC(probability =True)
# Combine the three classifiers with SOFT voting: the ensemble averages the
# predicted class probabilities and picks the highest-probability class.
# (The previous comment said "hard voting", contradicting voting='soft'.)
voting_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('rf', random_clf), ('svc', svm_clf)],
    voting='soft')
# Fit every classifier — including the voting ensemble — and report each
# test-set accuracy.  The redundant standalone voting_clf.fit() was removed:
# the loop below refits the ensemble anyway.
for clf in (logreg_clf, random_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, round(accuracy_score(y_test, y_pred),2))
Feature Importance:
# Recreate the dataset, one-hot encode it, train a forest, and print the
# impurity-based importance of each encoded feature.
boost_classifier = pd.DataFrame({'Masters':['Yes','Yes','No','Yes','No','No','No','Yes','Yes','No','No','Yes','No'],
                                 'Age':[22,57,44,29,32,24,27,30,31,50,47,39,46],
                                 'Language': ['Python','R','Python','C+','R','Python','Python','Python','R','C+','Python','R','R'],
                                 'Accepted': [1,1,0,0,1,1,0,1,0,0,1,1,1]})
# Dummy Encoding
dummy_subset = pd.get_dummies(boost_classifier)
# Split into explanatory variables and target
X = dummy_subset.drop(['Accepted'],axis =1).values
y = dummy_subset['Accepted'].values
# Train a 300-tree forest using all available cores
forest_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
forest_clf.fit(X, y)
# Pair each feature column (everything except 'Accepted', index 1) with its
# importance score and print them.
feature_names = dummy_subset.columns[[0,2,3,4,5,6]]
importances = forest_clf.feature_importances_
for name, score in zip(feature_names, importances):
    print(name, score)
# We can see from our feature importance Age significantly stands out